## Pandya 2010 IO

library(foreign)
library(Amelia)

## 1995
## Load the original replication dataset and inspect it.
p2010_1995 <- read.dta("P2010 IO Rep Data 1995.dta")
head(p2010_1995)
dim(p2010_1995)
summary(p2010_1995)

## Drop ID and other index vars
p2010_1995$year <- NULL

## Drop character variables.
## Names that are not present in the data are simply ignored, which is the
## same outcome as assigning NULL to a column that does not exist.
char_vars_1995 <- c(
  "s11a", "s11", "s10", "s11c", "p56", "p11", "p57", "s6", "s10b", "s14",
  "p16", "p18", "p19", "p27f", "p27k", "p47a", "p59", "p66a", "s10_1",
  "p66b", "p73", "s11d", "s12b", "occupation"
)
p2010_1995 <- p2010_1995[!(colnames(p2010_1995) %in% char_vars_1995)]

## Drop derived dummy vars.
## The `_Ipais_*` columns are selected by name instead of by hard-coded
## position (previously columns 74-80), so this step keeps removing the right
## columns even if the set of variables dropped above changes.
## NOTE(review): the earlier version looked up `_Ipais_2` .. `_Ipais_8` and
## removed 7 positions; check the printed names below to confirm that the
## pattern selects exactly those columns.
pais_dummy_1995 <- grepl("^_Ipais_", colnames(p2010_1995))
colnames(p2010_1995)[pais_dummy_1995]
which(pais_dummy_1995)

p2010_1995 <- p2010_1995[!pais_dummy_1995]

## How many variables? 77: no reduction necessary
dim(p2010_1995)

## Imputation
## Append a running row number as column `case`. seq_len() is used so that an
## empty data frame yields an empty index rather than c(1, 0).
case <- seq_len(nrow(p2010_1995))
p2010_1995 <- cbind(p2010_1995, case)
head(p2010_1995)

## What is average percentage of missing data?
## Count the missing values in each column of `x`.
##
## Args:
##   x: a data frame or a matrix.
## Returns:
##   An unnamed integer vector holding one NA count per column, in column
##   order.
NAs <- function(x) {
    # is.na() returns a logical matrix for both data frames and matrices, so
    # the columns are never coerced to a common type the way apply() on a
    # data frame would do. as.integer() also drops the column names.
    as.integer(colSums(is.na(x)))
}
## Missing-value count per column, followed by the mean share of missing
## cells across columns, expressed in percent.
NAs(p2010_1995)
100 * mean(NAs(p2010_1995) / nrow(p2010_1995))

## Thus: 7 imputations

## The leading zero has no effect: R reads the literal 02138 as 2138.
set.seed(02138)

## Multiple imputation with `pais` as the cross-section variable and an
## empirical prior equal to 1% of the number of rows.
## NOTE(review): the `case` column added earlier is passed in as an ordinary
## variable; if it is only meant as a row identifier, consider declaring it
## through the `idvars` argument. Left untouched here because it would change
## the imputed values.
p2010_1995.out <- amelia(
  p2010_1995,
  m = 7,
  cs = "pais",
  empri = 0.01 * nrow(p2010_1995)
)

## Write all imputations into a single Stata file.
write.amelia(
  obj = p2010_1995.out,
  file.stem = "P2010 IO Imp Data 1995",
  format = "dta",
  separate = FALSE
)

## 1998
## Load the original replication dataset and inspect it.
p2010_1998 <- read.dta("P2010 IO Rep Data 1998.dta")
head(p2010_1998)
dim(p2010_1998)
summary(p2010_1998)

## Drop ID vars
p2010_1998$year <- NULL
p2010_1998$idenpa <- NULL

## Drop character variables.
## Names that are not present in the data are simply ignored, which is the
## same outcome as assigning NULL to a column that does not exist.
char_vars_1998 <- c(
  "np14a", "np14b", "np14c", "np14d", "np14e", "np14f", "np14g",
  "np15", "np17", "np16",
  "sp44a", "sp44b", "sp44c", "sp44d", "sp46", "sp78", "sp79",
  "s1", "s5", "s7a", "s7b", "s8", "s8c", "s8d", "s11", "s13",
  "s14a", "s14c", "s16a", "s16b", "s17", "s17d", "s17c",
  "occupation"
)
p2010_1998 <- p2010_1998[!(colnames(p2010_1998) %in% char_vars_1998)]

## Drop vars with no variation
p2010_1998$support_market <- NULL

## Drop derived dummy vars.
## The `_Ipais_*` columns are selected by name instead of by hard-coded
## position (previously columns 58-73), so this step keeps removing the right
## columns even if the set of variables dropped above changes.
## NOTE(review): the earlier version looked up 15 names (`_Ipais_2` ..
## `_Ipais_13` and `_Ipais_15` .. `_Ipais_17`, no `_Ipais_14`) but removed 16
## positions; check the printed names below to confirm that the pattern
## selects the intended 16 columns.
pais_dummy_1998 <- grepl("^_Ipais_", colnames(p2010_1998))
colnames(p2010_1998)[pais_dummy_1998]
which(pais_dummy_1998)

p2010_1998 <- p2010_1998[!pais_dummy_1998]

## How many variables? 64: no reduction necessary
dim(p2010_1998)

## Imputation
## Append a running row number as column `case`. seq_len() is used so that an
## empty data frame yields an empty index rather than c(1, 0).
case <- seq_len(nrow(p2010_1998))
p2010_1998 <- cbind(p2010_1998, case)
head(p2010_1998)

## What is average percentage of missing data?
## Count the missing values in each column of `x`.
##
## Args:
##   x: a data frame or a matrix.
## Returns:
##   An unnamed integer vector holding one NA count per column, in column
##   order.
NAs <- function(x) {
    # is.na() returns a logical matrix for both data frames and matrices, so
    # the columns are never coerced to a common type the way apply() on a
    # data frame would do. as.integer() also drops the column names.
    as.integer(colSums(is.na(x)))
}
## Missing-value count per column, followed by the mean share of missing
## cells across columns, expressed in percent.
NAs(p2010_1998)
100 * mean(NAs(p2010_1998) / nrow(p2010_1998))

## Thus: 7 imputations

## The leading zero has no effect: R reads the literal 02138 as 2138.
set.seed(02138)

## Multiple imputation with `pais` as the cross-section variable and an
## empirical prior equal to 1% of the number of rows.
## NOTE(review): the `case` column added earlier is passed in as an ordinary
## variable; if it is only meant as a row identifier, consider declaring it
## through the `idvars` argument. Left untouched here because it would change
## the imputed values.
p2010_1998.out <- amelia(
  p2010_1998,
  m = 7,
  cs = "pais",
  empri = 0.01 * nrow(p2010_1998)
)

## Write all imputations into a single Stata file.
## NOTE(review): this file stem has no "IO", unlike the stems used for the
## other years in this script; confirm which name downstream code expects
## before changing it.
write.amelia(
  obj = p2010_1998.out,
  file.stem = "P2010 Imp Data 1998",
  format = "dta",
  separate = FALSE
)

## 2001
## Load the original replication dataset and inspect it.
## NOTE(review): the object is called `p2010_2000` although the file it holds
## is the 2001 dataset; the name is kept because the rest of the script
## refers to it.
p2010_2000 <- read.dta("P2010 IO Rep Data 2001.dta")
head(p2010_2000)
dim(p2010_2000)
summary(p2010_2000)

## Drop ID vars
p2010_2000$idenpa <- NULL
p2010_2000$year <- NULL

## Drop character variables.
## Names that are not present in the data are simply ignored, which is the
## same outcome as assigning NULL to a column that does not exist.
## NOTE(review): `p32n` appeared twice in the earlier drop list; verify that
## no other variable was meant in place of the duplicate.
char_vars_2001 <- c(
  "p11st", "p15sta", "p15std", "p15stf", "p32n",
  "p33st", "p34st", "p35st", "p54st", "p73st",
  "s1", "s4", "s6", "s8a", "s8b", "s9", "s17", "s18a", "s18b", "s19",
  "reeduc1", "reeduc3", "occupation"
)
p2010_2000 <- p2010_2000[!(colnames(p2010_2000) %in% char_vars_2001)]

## Drop derived vars.
## The `_Ipais_*` columns are selected by name instead of by hard-coded
## position (previously columns 50-65), so this step keeps removing the right
## columns even if the set of variables dropped above changes.
## NOTE(review): the earlier version looked up 15 names (`_Ipais_2` ..
## `_Ipais_13` and `_Ipais_15` .. `_Ipais_17`, no `_Ipais_14`) but removed 16
## positions; check the printed names below to confirm that the pattern
## selects the intended 16 columns.
pais_dummy_2001 <- grepl("^_Ipais_", colnames(p2010_2000))
colnames(p2010_2000)[pais_dummy_2001]
which(pais_dummy_2001)

p2010_2000 <- p2010_2000[!pais_dummy_2001]

## How many variables? 50: no reduction necessary
dim(p2010_2000)

## Imputation
## Append a running row number as column `case`. seq_len() is used so that an
## empty data frame yields an empty index rather than c(1, 0).
case <- seq_len(nrow(p2010_2000))
p2010_2000 <- cbind(p2010_2000, case)
head(p2010_2000)

## What is average percentage of missing data?
## Count the missing values in each column of `x`.
##
## Args:
##   x: a data frame or a matrix.
## Returns:
##   An unnamed integer vector holding one NA count per column, in column
##   order.
NAs <- function(x) {
    # is.na() returns a logical matrix for both data frames and matrices, so
    # the columns are never coerced to a common type the way apply() on a
    # data frame would do. as.integer() also drops the column names.
    as.integer(colSums(is.na(x)))
}
## Missing-value count per column, followed by the mean share of missing
## cells across columns, expressed in percent.
NAs(p2010_2000)
100 * mean(NAs(p2010_2000) / nrow(p2010_2000))

## Thus: 5 imputations

## The leading zero has no effect: R reads the literal 02138 as 2138.
set.seed(02138)

## Multiple imputation with `pais` as the cross-section variable and an
## empirical prior equal to 1% of the number of rows.
## NOTE(review): the `case` column added earlier is passed in as an ordinary
## variable; if it is only meant as a row identifier, consider declaring it
## through the `idvars` argument. Left untouched here because it would change
## the imputed values.
p2010_2000.out <- amelia(
  p2010_2000,
  m = 5,
  cs = "pais",
  empri = 0.01 * nrow(p2010_2000)
)

## Write all imputations into a single Stata file.
write.amelia(
  obj = p2010_2000.out,
  file.stem = "P2010 IO Imp Data 2001",
  format = "dta",
  separate = FALSE
)